home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Collection of Tools & Utilities
/
Collection of Tools and Utilities.iso
/
edit
/
mg2a_src.zip
/
REGEX.C
< prev
next >
Wrap
C/C++ Source or Header
|
1988-08-23
|
45KB
|
1,595 lines
/* Imagine Generic gnuemacs copyright notice here */
/* modified by Robert Larson to make fewer assumptions about the compiler */
/* (Making it useful to mg) (#if not supported by osk, enum not supported by many) */
/* To test, compile with -Dtest.
This Dtestable feature turns this into a self-contained program
which reads a pattern, describes how it compiles,
then reads a string and searches for it. */
#ifdef REGEX
#include "def.h" /* defines VOID etc. for mg */
#ifdef emacs
/* The `emacs' switch turns on certain special matching commands
that make sense only in emacs. */
#include "config.h"
#include "lisp.h"
#include "buffer.h"
#include "syntax.h"
#else /* not emacs */
/*
* Define the syntax stuff, so we can do the \<...\> things.
*/
#ifndef Sword /* must be non-zero in some of the tests below... */
#define Sword 1
#endif
#define SYNTAX(c) re_syntax_table[c]
#ifdef SYNTAX_TABLE
char *re_syntax_table;
#else
static char re_syntax_table[256];
static VOID
init_syntax_once ()
{
register int c;
static int done = 0;
if (done)
return;
bzero (re_syntax_table, sizeof re_syntax_table);
for (c = 'a'; c <= 'z'; c++)
re_syntax_table[c] = Sword;
for (c = 'A'; c <= 'Z'; c++)
re_syntax_table[c] = Sword;
for (c = '0'; c <= '9'; c++)
re_syntax_table[c] = Sword;
done = 1;
}
#endif /* SYNTAX_TABLE */
#endif /* not emacs */
#include "regex.h"
/* Number of failure points to allocate space for initially,
when matching. If this number is exceeded, more space is allocated,
so it is not a hard limit. */
#ifndef NFAILURES
#define NFAILURES 80
#endif NFAILURES
/* width of a byte in bits */
#define BYTEWIDTH 8
/* These are the command codes that appear in compiled regular expressions, one per byte.
Some command codes are followed by argument bytes.
A command code can specify any interpretation whatever for its arguments.
Zero-bytes may appear in the compiled regular expression. */
typedef char regexpcode;
#define unused 0
#define exactn 1 /* followed by one byte giving n, and then by n literal bytes */
#define begline 2 /* fails unless at beginning of line */
#define endline 3 /* fails unless at end of line */
#define jump 4 /* followed by two bytes giving relative address to jump to */
#define on_failure_jump 5 /* followed by two bytes giving relative address of place */
/* to resume at in case of failure. */
#define finalize_jump 6 /* Throw away latest failure point and then jump to address. */
#define maybe_finalize_jump 7 /* Like jump but finalize if safe to do so. */
/* This is used to jump back to the beginning
of a repeat. If the command that follows
this jump is clearly incompatible with the
one at the beginning of the repeat, such that
we can be sure that there is no use backtracking
out of repetitions already completed,
then we finalize. */
#define dummy_failure_jump 8 /* jump, and push a dummy failure point. */
/* This failure point will be thrown away
if an attempt is made to use it for a failure.
A + construct makes this before the first repeat. */
#define anychar 9 /* matches any one character */
#define charset 10 /* matches any one char belonging to specified set. */
/* First following byte is # bitmap bytes.
Then come bytes for a bit-map saying which chars are in.
Bits in each byte are ordered low-bit-first.
A character is in the set if its bit is 1.
A character too large to have a bit in the map
is automatically not in the set */
#define charset_not 11 /* similar but match any character that is NOT one of those specified */
#define start_memory 12 /* starts remembering the text that is matched */
/* and stores it in a memory register.
followed by one byte containing the register number.
Register numbers must be in the range 0 through NREGS. */
#define stop_memory 13 /* stops remembering the text that is matched */
/* and stores it in a memory register.
followed by one byte containing the register number.
Register numbers must be in the range 0 through NREGS. */
#define duplicate 14 /* match a duplicate of something remembered. */
/* Followed by one byte containing the index of the memory register. */
#define before_dot 15 /* Succeeds if before dot */
#define at_dot 16 /* Succeeds if at dot */
#define after_dot 17 /* Succeeds if after dot */
#define begbuf 18 /* Succeeds if at beginning of buffer */
#define endbuf 19 /* Succeeds if at end of buffer */
#define wordchar 20 /* Matches any word-constituent character */
#define notwordchar 21 /* Matches any char that is not a word-constituent */
#define wordbeg 22 /* Succeeds if at word beginning */
#define wordend 23 /* Succeeds if at word end */
#define wordbound 24 /* Succeeds if at a word boundary */
#define notwordbound 25 /* Succeeds if not at a word boundary */
#define syntaxspec 26 /* Matches any character whose syntax is specified. */
/* followed by a byte which contains a syntax code, Sword or such like */
#define notsyntaxspec 27 /* Matches any character whose syntax differs from the specified. */
#ifndef SIGN_EXTEND_CHAR
#define SIGN_EXTEND_CHAR(x) (x)
#endif
/* compile_pattern takes a regular-expression descriptor string in the user's format
and converts it into a buffer full of byte commands for matching.
pattern is the address of the pattern string
size is the length of it.
bufp is a struct re_pattern_buffer * which points to the info
on where to store the byte commands.
This structure contains a char * which points to the
actual space, which should have been obtained with malloc.
compile_pattern may use realloc to grow the buffer space.
The number of bytes of commands can be found out by looking in
the struct re_pattern_buffer that bufp pointed to,
after compile_pattern returns.
*/
#define PATPUSH(ch) (*b++ = (char) (ch))
#define PATFETCH(c) \
{if (p == pend) goto end_of_pattern; \
c = * (unsigned char *) p++; \
if (translate) c = translate[c]; }
#define PATFETCH_RAW(c) \
{if (p == pend) goto end_of_pattern; \
c = * (unsigned char *) p++; }
#define PATUNFETCH p--
#define EXTEND_BUFFER \
{ char *old_buffer = bufp->buffer; \
if (bufp->allocated == (1<<16)) goto too_big; \
bufp->allocated *= 2; \
if (bufp->allocated > (1<<16)) bufp->allocated = (1<<16); \
if (!(bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated))) \
goto memory_exhausted; \
c = bufp->buffer - old_buffer; \
b += c; \
if (fixup_jump) \
fixup_jump += c; \
if (laststart) \
laststart += c; \
begalt += c; \
if (pending_exact) \
pending_exact += c; \
}
static int store_jump (), insert_jump ();
char *
re_compile_pattern (pattern, size, bufp)
char *pattern;
int size;
struct re_pattern_buffer *bufp;
{
register char *b = bufp->buffer;
register char *p = pattern;
char *pend = pattern + size;
register unsigned c, c1;
char *p1;
unsigned char *translate = (unsigned char *) bufp->translate;
/* address of the count-byte of the most recently inserted "exactn" command.
This makes it possible to tell whether a new exact-match character
can be added to that command or requires a new "exactn" command. */
char *pending_exact = 0;
/* address of the place where a forward-jump should go
to the end of the containing expression.
Each alternative of an "or", except the last, ends with a forward-jump
of this sort. */
char *fixup_jump = 0;
/* address of start of the most recently finished expression.
This tells postfix * where to find the start of its operand. */
char *laststart = 0;
/* In processing a repeat, 1 means zero matches is allowed */
char zero_times_ok;
/* In processing a repeat, 1 means many matches is allowed */
char many_times_ok;
/* addr